Google Advanced Data Analytics Program: Using Python to Predict Employee Turnover.¶

By César Perez

Introduction.¶

For this exercise we will help a fictional France-based company, Salifort Motors, design an ML model that predicts whether an employee will leave the company. There is a well-known optimal solution for this project using a decision tree algorithm; however, for the sake of exploring an alternative, I will focus on logistic regression. I'll first create a simple model for reference, and then propose a feature engineering approach to enhance its performance.

This notebook is divided into the following sections:

  • Introduction.
  • Exploratory Data Analysis.
  • Model Construction.
  • Conclusions.

Exploratory Data Analysis.¶

In this section we will explore the dataset. Our goal is to build a general understanding of the data, which we will then use to build the best model to predict whether an employee will leave the company or not.

To achieve the business goal, we have been given a single dataset with 14,999 rows and 10 columns, containing the following variables:

Variable Description
satisfaction_level Employee-reported job satisfaction level [0–1]
last_evaluation Score of employee's last performance review [0–1]
number_project Number of projects employee contributes to
average_montly_hours Average number of hours the employee worked per month (the column name is misspelled in the source file)
time_spend_company How long the employee has been with the company (years)
Work_accident Whether or not the employee experienced an accident while at work
left Whether or not the employee left the company
promotion_last_5years Whether or not the employee was promoted in the last 5 years
Department The employee's department
salary The employee's salary level (low, medium, high)

First, let's prepare our work environment. For now let's use pandas and NumPy for data manipulation, and Matplotlib and seaborn for visuals.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

The data has been provided as a csv file. The following cell will read the data and display basic information about the resulting dataset.

In [2]:
base_df = pd.read_csv('HR_capstone_dataset.csv')
base_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   Department             14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB

The following cell displays the first 5 rows.

In [3]:
base_df.head()
Out[3]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years Department salary
0 0.38 0.53 2 157 3 0 1 0 sales low
1 0.80 0.86 5 262 6 0 1 0 sales medium
2 0.11 0.88 7 272 4 0 1 0 sales medium
3 0.72 0.87 5 223 5 0 1 0 sales low
4 0.37 0.52 2 159 3 0 1 0 sales low

Let's check whether the data has duplicate rows.

In [4]:
base_df.duplicated().sum()
Out[4]:
3008

There are 3,008 duplicate entries. Given that there is no employee id column, or any other feature that could confirm whether it is plausible for two employees to have the exact same salary, tenure, satisfaction score, etc., we'll consider it likely that the dataset has errors and thus we'll remove the duplicates.
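
Before dropping them, it can be worth eyeballing a few of the repeated rows first. A minimal sketch of such a check (optional, an assumption on my part rather than part of the original flow):

# Show a sample of the rows that appear more than once, sorted so identical rows sit together
base_df[base_df.duplicated(keep=False)].sort_values(by=list(base_df.columns)).head(10)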

In [5]:
base_df.drop_duplicates(inplace = True)

To get an overall understanding of how the numerical data is distributed, regardless of department, let's use the describe method to investigate.

In [6]:
base_df.describe()
Out[6]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years
count 11991.000000 11991.000000 11991.000000 11991.000000 11991.000000 11991.000000 11991.000000 11991.000000
mean 0.629658 0.716683 3.802852 200.473522 3.364857 0.154282 0.166041 0.016929
std 0.241070 0.168343 1.163238 48.727813 1.330240 0.361234 0.372133 0.129012
min 0.090000 0.360000 2.000000 96.000000 2.000000 0.000000 0.000000 0.000000
25% 0.480000 0.570000 3.000000 157.000000 3.000000 0.000000 0.000000 0.000000
50% 0.660000 0.720000 4.000000 200.000000 3.000000 0.000000 0.000000 0.000000
75% 0.820000 0.860000 5.000000 243.000000 4.000000 0.000000 0.000000 0.000000
max 1.000000 1.000000 7.000000 310.000000 10.000000 1.000000 1.000000 1.000000

Given the business requirement, "left" is our target column. The value 0 (false) means that an employee has not left the company, while the value 1 (true) means the opposite. Let's see the proportion of these two values in the entire dataset.

In [7]:
base_df['left'].value_counts()
Out[7]:
0    10000
1     1991
Name: left, dtype: int64
In [8]:
round(base_df['left'].value_counts(normalize = True) *100,2)
Out[8]:
0    83.4
1    16.6
Name: left, dtype: float64

To avoid repetitive code, let's declare some auxiliary functions. They are briefly described below, but read the documentation to learn more about them:

  • percentage_analysis: this function will be useful to dynamically create and combine plots representing percentages.
  • convert_to_quartile and convert_to_zscore: these functions will help us add additional columns to our dataset to simplify the analysis.

Let's make sure to document our functions in case they have to be used by other members of our team.

In [9]:
def percentage_analysis(df, level_columns, grid_column = None, variable = 'left', label_type = 1, rot = 0):
    """
    Description:
        Returns a horizontal bar plot (barh) and a dictionary of dataframes with the percentage breakdown of a selected target variable
    Arguments:
        df: A pandas dataset
        level_columns: a list of strings of existing columns on the dataset (df). The column values are used to compare differences in percentage on the target variable.
        grid_column: a string representing an existing column on the dataset. The column values are used to create subplots. Its use is optional, Default: None
        variable: a string representing an existing column on the dataset; it is recommended to use categorical variables. The column represents the target variable being analysed. Default: 'left'.
        label_type: Argument used to configure labeling for the horizontal bars. 0 = no label, 1 = only percentages, 2 = percentages and count. Default: 1.
        rot: integer representing the preferred rotation of the bar label text. Default: 0.
    """
    subset_output = {}
    df_temp = df.copy()
    levels_variable = df_temp[variable].unique()
    levels_variable.sort()
    if grid_column == None:
        df_temp['temp'] = True
        grid_column = 'temp'
    grid_col = df_temp[grid_column].unique()
    grid_col.sort()
    fig = plt.figure(figsize=(4 * len(level_columns), 7 * len(grid_col)))
    sfigs = fig.subfigures(1, len(level_columns))
    for lcol_indx, sfig in enumerate(sfigs if len(level_columns) > 1 else range(1)):
        if len(level_columns) > 1:
            axs = sfig.subplots(nrows = len(grid_col))
        else:
            axs = sfigs.subplots(nrows = len(grid_col))
        level_col = level_columns[lcol_indx]
        level_col_vals = df_temp[level_col].unique()
        for indx, grid_item in enumerate(grid_col):
            table_dict = {'level':[]}
            for val in levels_variable:
                table_dict[str(val)+'_count'] = []
                table_dict[str(val)+'_percent'] = []
                table_dict[str(val)+'_label'] = []
            for level in level_col_vals:
                table_dict['level'].append(str(level))
                total_level =  df_temp.loc[(df_temp[grid_column]==grid_item)&(df_temp[level_col]==level), level_col].count()
                for val in levels_variable:
                    temp_count = df_temp.loc[(df_temp[grid_column]==grid_item)&(df_temp[level_col]==level)&(df_temp[variable]==val),variable].count()
                    temp_percent = 0 if temp_count == 0 else round(temp_count/(total_level)*100,2)
                    if label_type == 0 or temp_count == 0:
                        temp_label = ''
                    elif label_type == 1:
                        temp_label = str(temp_percent) + '%'
                    elif label_type == 2:
                        temp_label = str(temp_percent) + '% / ' + str(temp_count)
                    table_dict[str(val)+'_count'].append(temp_count)
                    table_dict[str(val)+'_percent'].append(temp_percent)
                    table_dict[str(val)+'_label'].append(temp_label)       
            plot_df = pd.DataFrame(table_dict)
            plot_df.sort_values(by = 'level', ascending = False, inplace = True)
            subset_output[str(level_columns[lcol_indx])+'_'+str(grid_item)] = plot_df.copy()
            plot_df['left_value'] = 0
            if grid_column == 'temp':
                axs.set_title(f'Target variable: {variable}\nPercentage by: {level_col}')
                for val in levels_variable:
                    b1 = axs.barh(plot_df['level'], plot_df[str(val)+'_percent'], left = plot_df['left_value'], label = val)
                    plot_df['left_value'] = plot_df['left_value'] + plot_df[str(val)+'_percent']
                    axs.bar_label(b1, labels = plot_df[str(val)+'_label'], label_type='center', rotation=rot)
                axs.set_ylabel(level_col) 
                leg = axs.legend()

            else:
                axs[indx].set_title(f'Target variable: {variable}\nPercentage by:{level_col}\nSubplot - {grid_column}: {grid_item}')
                for val in levels_variable:
                    b1 = axs[indx].barh(plot_df['level'], plot_df[str(val)+'_percent'], left = plot_df['left_value'], label = val)
                    plot_df['left_value'] = plot_df['left_value'] + plot_df[str(val)+'_percent']
                    axs[indx].bar_label(b1, labels = plot_df[str(val)+'_label'], label_type='center', rotation=rot)
                axs[indx].set_ylabel(level_col) 
                leg = axs[indx].legend()
    return fig, subset_output

def convert_to_quartile(df, target_col = []):
    """
    Description:
        Adds to a provided pandas dataframe, columns representing the quartile value of the selected numerical columns.
        The new inserted columns will include the values:
            1. min to 24.9%
            2. 25% to 49.9%
            3. 50% to 74.9%
            4. 75% to max.
        The new columns will be named as: column name + '_quartile'
    Arguments:
        df: A pandas dataset
        target_col: a list of strings representing the columns to be transformed to quartile values
    """
    extended_df = df.copy()
    quartiles_list = [0, 0.25, 0.50, 0.75]
    for col in target_col:
        col_name = col + '_quartile'
        for indx, quantile in enumerate(quartiles_list):
            min_val = quantile
            max_val = 1 if quantile == 0.75 else quartiles_list[indx+1]
            extended_df.loc[(extended_df[col] >= extended_df[col].quantile(min_val)) & (extended_df[col] <= extended_df[col].quantile(max_val)),col_name] = indx+1
    return extended_df

def convert_to_zscore(df, target_col = []):
    """
    Description:
        Adds to a provided pandas dataframe, columns representing the z-score value of the selected numerical columns.
        The new columns will be named as: column name + '_standardized'
    Arguments:
        df: A pandas dataset
        target_col: a list of strings representing the columns to be transformed to z-scores
    """
    extended_df = df.copy()
    for col in target_col:
        col_name = col + '_standardized'
        extended_df[col_name] = (extended_df[col] -extended_df[col].mean()) / extended_df[col].std() 
    return extended_df

Let's confirm how other data professionals would see the brief documentation we have included in our functions.

In [10]:
help(percentage_analysis)
Help on function percentage_analysis in module __main__:

percentage_analysis(df, level_columns, grid_column=None, variable='left', label_type=1, rot=0)
    Description:
        Returns a horizontal bar plot (barh) and a dictionary of dataframes with the percentage breakdown of a selected target variable
    Arguments:
        df: A pandas dataset
        level_columns: a list of strings of existing columns on the dataset (df). The column values are used to compare differences in percentage on the target variable.
        grid_column: a string representing an existing column on the dataset. The column values are used to create subplots. Its use is optional, Default: None
        variable: a string representing an existing column on the dataset; it is recommended to use categorical variables. The column represents the target variable being analysed. Default: 'left'.
        label_type: Argument used to configure labeling for the horizontal bars. 0 = no label, 1 = only percentages, 2 = percentages and count. Default: 1.
        rot: integer representing the preferred rotation of the bar label text. Default: 0.

In [11]:
help(convert_to_quartile)
Help on function convert_to_quartile in module __main__:

convert_to_quartile(df, target_col=[])
    Description:
        Adds to a provided pandas dataframe, columns representing the quartile value of the selected numerical columns.
        The new inserted columns will include the values:
            1. min to 24.9%
            2. 25% to 49.9%
            3. 50% to 74.9%
            4. 75% to max.
        The new columns will be named as: column name + '_quartile'
    Arguments:
        df: A pandas dataset
        target_col: a list of strings representing the columns to be transformed to quartile values

In [12]:
help(convert_to_zscore)
Help on function convert_to_zscore in module __main__:

convert_to_zscore(df, target_col=[])
    Description:
        Adds to a provided pandas dataframe, columns representing the z-score value of the selected numerical columns.
        The new columns will be named as: column name + '_standardized'
    Arguments:
        df: A pandas dataset
        target_col: a list of strings representing the columns to be transformed to z-scores

Let's add the extra quartile columns to our dataset and display the first rows to see the results.

In [13]:
base_df_2 = convert_to_quartile(base_df, ['satisfaction_level', 'last_evaluation', 'average_montly_hours', 'time_spend_company'])
base_df_2.head()
Out[13]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years Department salary satisfaction_level_quartile last_evaluation_quartile average_montly_hours_quartile time_spend_company_quartile
0 0.38 0.53 2 157 3 0 1 0 sales low 1.0 1.0 2.0 3.0
1 0.80 0.86 5 262 6 0 1 0 sales medium 3.0 4.0 4.0 4.0
2 0.11 0.88 7 272 4 0 1 0 sales medium 1.0 4.0 4.0 4.0
3 0.72 0.87 5 223 5 0 1 0 sales low 3.0 4.0 3.0 4.0
4 0.37 0.52 2 159 3 0 1 0 sales low 1.0 1.0 2.0 3.0

Let's confirm that the output of convert_to_quartile is consistent with the output of the describe method used earlier.

In [14]:
base_df_2.groupby('satisfaction_level_quartile').agg({'satisfaction_level':['min', 'max', 'mean', 'count']})
Out[14]:
satisfaction_level
min max mean count
satisfaction_level_quartile
1.0 0.09 0.47 0.286134 2887
2.0 0.48 0.65 0.566255 3020
3.0 0.66 0.81 0.736552 2918
4.0 0.82 1.00 0.904867 3166

satisfaction_level_quartile looks good; let's now check time_spend_company_quartile.

In [15]:
base_df_2.groupby('time_spend_company_quartile').agg({'time_spend_company':['min', 'max', 'mean', 'count']})
Out[15]:
time_spend_company
min max mean count
time_spend_company_quartile
1.0 2 2 2.000000 2910
3.0 3 3 3.000000 5190
4.0 4 10 4.872269 3891

It looks like our function shows an odd behaviour here: the second range is missing. This can only happen because quantiles 0.25 and 0.50 have the same value, so the corresponding label gets overwritten; see the output of the describe method for this column above. Let's capture the lower and upper limits to filter outliers based on the IQR for this column (time_spend_company), as we'll use them later. Given that the difference between Q3 and Q1 is 1 (4 - 3), the limits can simply be captured as:

In [16]:
upper_limit = 4 + 1.5 
lower_limit = 3 - 1.5 
print(f"Lower limit: {lower_limit}, upper limit: {upper_limit}")
Lower limit: 1.5, upper limit: 5.5
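
The same limits could also be computed directly from the quantiles instead of hard-coding them; a small sketch (equivalent to the calculation above, using the Q1 = 3 and Q3 = 4 values reported by describe):

q1 = base_df_2['time_spend_company'].quantile(0.25)
q3 = base_df_2['time_spend_company'].quantile(0.75)
iqr = q3 - q1
print(f"Lower limit: {q1 - 1.5 * iqr}, upper limit: {q3 + 1.5 * iqr}")

This should reproduce the 1.5 and 5.5 limits above.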

Let's check how many employees we have by department; we want to confirm that each department has enough observations.

In [17]:
base_df_2['Department'].value_counts()
Out[17]:
sales          3239
technical      2244
support        1821
IT              976
RandD           694
product_mng     686
marketing       673
accounting      621
hr              601
management      436
Name: Department, dtype: int64

With values ranging from 436 to 3,239, it's safe to say that we have enough observations by department to draw valid conclusions.

Let's investigate how the turnover rate differs by department using the function we defined earlier. As a refresher, the function requires the pandas dataframe, a list of the columns we want to use to compare percentage differences, an optional column used to create subplots (default None), the labeling style (default 1, which displays the percentages without the count of observations) and our target column, the one we want to extract the percentages of (default 'left').
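
For reference, this is how a hypothetical call exercising the optional arguments might look, and how the returned dictionary is keyed (the level column name plus the subplot value, which defaults to True when no grid_column is given). This snippet is only illustrative and is not executed in this notebook:

chart, tables = percentage_analysis(base_df_2, ['salary'], label_type = 2, rot = 20)
tables['salary_True']   # summary dataframe behind the 'salary' plot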

In [18]:
chart, subsets = percentage_analysis(base_df_2, ['Department'])

Our function also returns the underlying data as summarized dataframes; let's use the one for Department to investigate the distribution of the observed turnover rate.

In [19]:
summary = subsets['Department_True']
summary.rename(columns = {'1_percent':'Employee left = 1'}, inplace = True)
summary_desc = summary[['Employee left = 1']].describe()
summary_desc
Out[19]:
Employee left = 1
count 10.000000
mean 16.088000
std 2.245508
min 11.930000
25% 16.070000
50% 16.810000
75% 17.317500
max 18.800000

The HR and Management departments have the maximum and minimum turnover rates, at 18.8% and 11.93% respectively, and the interquartile range equals 1.25 (17.32 - 16.07). The difference between the mean and the median (16.09 - 16.81 = -0.72) seems to be driven by outliers (in particular, the lower values), and the standard deviation is impacted by this as well.

In [20]:
hist_chart = summary['Employee left = 1'].hist(bins=6, grid=True)
hist_chart.axvline(x = summary_desc.iloc[1].values[0], color = 'red')
hist_chart.axvline(x = summary_desc.iloc[5].values[0], color = 'orange')
plt.title('Turnover rate dispersion by department')
plt.xlabel('Observed Percentage') 
Out[20]:
Text(0.5, 0, 'Observed Percentage')

Having confirmed the differences by department, let's now inspect the overall percentage differences in 'left' across the following categorical variables:

  • number_project
  • salary
  • Work_accident
  • promotion_last_5years
In [21]:
chart, subsets = percentage_analysis(base_df_2, ['number_project', 'salary', 'Work_accident', 'promotion_last_5years'], label_type = 2, rot = 20)	

The 4 features above have a significant overall effect on the turnover rate. Some observations from the chart above are:

  • number_project, defined as "Number of projects employee contributes to" (seemingly a direct indicator of an employee's workload), shows that employees contributing to 2, 6 and 7 projects have a higher turnover rate; 100% of the employees contributing to 7 projects (145 observations) left the company.
  • salary: as we could expect, employees with a low salary have the highest turnover rate (20.45%), followed by medium (14.62%) and lastly high salary (4.85%).
  • Work_accident: it seems that employees that did not have a work accident have the highest turnover rate.
  • promotion_last_5years: of the small number of employees that got a promotion over the last 5 years, only 8 (3.94%) left the company.

Let's now inspect the differences by the numerical variables that we transformed to quartiles:

  • satisfaction_level_quartile
  • last_evaluation_quartile
  • average_montly_hours_quartile

And let's also include the column "time_spend_company" as it is.

In [22]:
chart, subsets = percentage_analysis(base_df_2, ['satisfaction_level_quartile', 'last_evaluation_quartile', 'average_montly_hours_quartile', 'time_spend_company'],label_type = 2, rot = 20)				

Some observations from the chart above are:

  • satisfaction_level, defined as "Employee-reported job satisfaction level [0–1]", shows significant differences in turnover rate across quartiles: 1,406 employees (48.7% of those in the first quartile, i.e. a satisfaction level between 0.09 and 0.47) left the company. Considering that the number of employees who left is 1,991 in the entire dataset, there is no question that this is a key feature for predicting turnover.
  • last_evaluation_quartile shows that most of the employees that left the company were inside the first and last intervals; in other words, they had the lowest and highest values.
  • average_montly_hours_quartile shows a pattern similar to last_evaluation_quartile and number_project.
  • time_spend_company: it seems that employees with a tenure of 4 to 6 years have the highest turnover rates.

Knowing that the number_project field explored earlier revealed significant differences in rate within its levels, let's repeat the previous chart, this time creating subplots for each number_project value. This chart will allow us to understand in greater detail what we've found so far.

In [23]:
chart, subsets = percentage_analysis(base_df_2, ['satisfaction_level_quartile', 'last_evaluation_quartile', 'average_montly_hours_quartile', 'time_spend_company'],grid_column = 'number_project', label_type = 2, rot = 20)

As previously discovered, employees contributing to 2, 6 and 7 projects have the highest turnover rates. The chart above helped us identify the following:

  • Most of the employees that left the company while contributing to only 2 projects had the lowest levels of satisfaction, evaluation scores and average monthly hours. Also, most of them had a tenure of three years.
  • On the contrary, most of the employees that left the company while contributing to 6 and 7 projects had the highest evaluation scores and monthly hours, but, like the previous group, they mostly had low satisfaction levels. Most members of this group had a tenure of 4 years.
  • The employees that left the company while contributing to 3 to 5 projects had high satisfaction levels, evaluation scores and monthly hours; most of them had a tenure of 5 or 6 years.

Why would the third group leave the company? They had fewer than 6 projects, good evaluation scores, and a high satisfaction level. Let's filter this group to investigate further.

In [24]:
df2a = base_df_2[(base_df_2['number_project'].isin([3,4,5])) & (base_df_2['left'] == 1) & (base_df_2['time_spend_company'].isin([5,6]))].groupby(['salary', 'promotion_last_5years', 'Work_accident']).agg({'left':'count'})
df2b = base_df_2[(base_df_2['number_project'].isin([3,4,5])) & (base_df_2['left'] == 0)& (base_df_2['time_spend_company'].isin([5,6]))].groupby(['salary', 'promotion_last_5years', 'Work_accident']).agg({'left':'count'})
df2a = df2a.join(df2b, lsuffix='_1', rsuffix='_0')
df2a['left_percent'] = round(df2a['left_1'] / (df2a['left_1'] + df2a['left_0'])*100,2)
df2a
Out[24]:
left_1 left_0 left_percent
salary promotion_last_5years Work_accident
high 0 0 13 54 19.40
low 0 0 289 251 53.52
1 8 61 11.59
medium 0 0 185 281 39.70
1 18 66 21.43
1 1 1 3 25.00

The column 'left_1' has the count of employees that left the company; the dataframe filters employees that contributed to 3 to 5 projects and had a tenure of 5 or 6 years. The combination of promotion_last_5years = 0 and Work_accident = 0 has the highest turnover rates.

Let's create one more series of plots, this time using satisfaction_level_quartile as the variable to break down our features, and let's also split the data between employees that have left the company and those who haven't. To avoid a confusing chart with overlapping labels, let's simply remove the labels for this chart.

In [25]:
chart, subsets = percentage_analysis(base_df_2, ['Department','number_project','last_evaluation_quartile', 'time_spend_company'], grid_column = 'left', variable = 'satisfaction_level_quartile', label_type = 0)	

From our exploration of the dataset we know that there are three groups of employees that are likely to leave the company:

  • group 1: employees who contribute to a small number of projects and have low evaluation scores.
  • group 2: employees with 6 or more projects, high evaluation scores and low satisfaction levels.
  • group 3: employees with a tenure of 5 or 6 years, high evaluation scores and satisfaction levels, but no promotions during the last 5 years and no work accidents.

The groups are easily visible if we plot satisfaction_level against average_montly_hours. Let's remember these 3 groups, as we will refer to them later. Also, let's define:

  • group 4: employees who have not left the company.
In [26]:
sns.scatterplot(base_df_2, x="satisfaction_level", y="average_montly_hours", hue="left", alpha=0.05)
Out[26]:
<AxesSubplot: xlabel='satisfaction_level', ylabel='average_montly_hours'>

We can see 4 main clusters. Three of them (label left = 1) represent our target population; one of them resides within the fourth cluster (the blue points), made of employees that have not left the company, and it's likely to be the greatest challenge for the ML model.

Model Construction.¶

With the knowledge we have gathered, let's prepare our environment to create the ML model.

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.metrics import roc_auc_score, roc_curve

Let's prepare one more auxiliary function to avoid repetitive code.

In [28]:
def perform_logistic_regression(df, target_feature = 'left', test_size = 0.25):
    """
    Description: 
        The function receives a dataset and a target column and performs a logistic regression
    Arguments:
        df: a pandas dataset.
        target_feature: the variable to be predicted. Default: 'left'.
        test_size: proportion of the data reserved for the test split. Default: 0.25
    """
    y = df[target_feature]
    X = df.drop(target_feature, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, stratify=y, random_state=42)
    log_clf = LogisticRegression(random_state=42, max_iter=500).fit(X_train, y_train)
    y_pred = log_clf.predict(X_test)
    log_cm = confusion_matrix(y_test, y_pred, labels=log_clf.classes_)
    log_disp = ConfusionMatrixDisplay(confusion_matrix=log_cm, display_labels=log_clf.classes_)
    log_disp.plot(values_format='')
    plt.show()
    target_names = ['Predicted would not leave', 'Predicted would leave']
    print(classification_report(y_test, y_pred, target_names=target_names))
    return log_clf, X_train, X_test, y_train, y_test, y_pred

Logistic Regression - simple.¶

As mentioned earlier, there is a well-known optimal solution for this problem; you are free to review that solution here. In this notebook I'll try to come up with an alternative solution by improving the performance of a basic logistic regression model.

Let's start by creating a simple model that we will use as a reference. The steps we'll follow are:

  • Create a copy of the initial dataset (the duplicated rows have already been removed).
  • Convert the feature 'salary' to an ordinal categorical variable.
  • Remove the outliers in 'time_spend_company', which can impact the performance of the model, using the IQR rule as reference.
In [29]:
df_lg1 = base_df.copy()
df_lg1['salary'] = (df_lg1['salary'].astype('category').cat.set_categories(['low', 'medium', 'high']).cat.codes)
df_lg1 = pd.get_dummies(df_lg1, drop_first=False)
df_lg1 = df_lg1[(df_lg1['time_spend_company'] >= lower_limit) & (df_lg1['time_spend_company'] <= upper_limit)]
df_lg1.head()
Out[29]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years salary Department_IT Department_RandD Department_accounting Department_hr Department_management Department_marketing Department_product_mng Department_sales Department_support Department_technical
0 0.38 0.53 2 157 3 0 1 0 0 0 0 0 0 0 0 0 1 0 0
2 0.11 0.88 7 272 4 0 1 0 1 0 0 0 0 0 0 0 1 0 0
3 0.72 0.87 5 223 5 0 1 0 0 0 0 0 0 0 0 0 1 0 0
4 0.37 0.52 2 159 3 0 1 0 0 0 0 0 0 0 0 0 1 0 0
5 0.41 0.50 2 153 3 0 1 0 0 0 0 0 0 0 0 0 1 0 0

The next cells will:

  • Create a subset called X: the transformed dataset without the target feature "left".
  • Create a subset called y, holding only the target feature "left".
  • Split X and y into train and test subsets.
  • Fit the training data to the regression model.
  • Build a confusion matrix to visually assess the performance of the model against the test data.
  • Build a classification report to inspect our key KPIs.

The KPIs we'll consider for evaluation are listed below; a small numeric illustration of these formulas follows the refresher list:

  • Accuracy = (TP + TN) / (TP + TN + FP + FN)
  • Precision = TP / (TP + FP)
  • Recall = TP / (TP + FN)
  • F1 = 2TP / (2TP + FP + FN)

As a refresher:

  • TP = True Positive.
  • FP = False Positive.
  • TN = True Negative.
  • FN = False Negative.
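
A quick numeric illustration of the formulas above, using hypothetical confusion-matrix counts rather than values from our data:

TP, FP, TN, FN = 80, 20, 150, 50              # hypothetical counts
accuracy  = (TP + TN) / (TP + TN + FP + FN)   # 230 / 300 ≈ 0.77
precision = TP / (TP + FP)                    # 80 / 100 = 0.80
recall    = TP / (TP + FN)                    # 80 / 130 ≈ 0.62
f1        = 2 * TP / (2 * TP + FP + FN)       # 160 / 230 ≈ 0.70
print(accuracy, precision, recall, f1)
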
In [30]:
log_clf, X_train, X_test, y_train, y_test, y_pred = perform_logistic_regression(df_lg1)
                           precision    recall  f1-score   support

Predicted would not leave       0.86      0.93      0.90      2321
    Predicted would leave       0.44      0.26      0.33       471

                 accuracy                           0.82      2792
                macro avg       0.65      0.60      0.61      2792
             weighted avg       0.79      0.82      0.80      2792

The 0.82 accuracy level is deceiving and mostly driven by the model's performance at predicting true negatives. If we focus on the model's ability to identify employees that left the company, the performance is poor, with a 0.44 precision score (123 / (123 + 156) = 0.44) and a 0.26 recall score (123 / (123 + 348) = 0.26).

Logistic Regression - Feature engineering.¶

Having confirmed the model's low performance when processing the data as-is, let's try to enhance the performance by engineering a new feature; let's simply call it "leave_score". First, let's create a new copy of the base dataset.

In [31]:
df_lg2 = base_df.copy()

Following our observations in the EDA section above, let's create a new column called "group" that will represent the main "clusters" already described. Just as a refresher:

  • group 1: employees who contribute to a small number of projects and have low evaluation scores.
  • group 2: employees with 6 or more projects, high evaluation scores and low satisfaction levels.
  • group 3: employees with a tenure of 5 or 6 years, high evaluation scores and satisfaction levels, but no promotions during the last 5 years and no work accidents.
  • group 4: employees who have not left the company.
In [32]:
df_lg2.loc[(base_df['left'] == 1) & (df_lg2['satisfaction_level'] < 0.5) & (df_lg2['average_montly_hours'] < 175), 'group'] = '1'
df_lg2.loc[(base_df['left'] == 1) & (df_lg2['satisfaction_level'] < 0.5) & (df_lg2['average_montly_hours'] >= 175), 'group'] = '2'
df_lg2.loc[(base_df['left'] == 1) & (df_lg2['satisfaction_level'] >= 0.5) & (df_lg2['average_montly_hours'] > 200), 'group'] = '3'
df_lg2.loc[base_df['left'] == 0 , 'group'] = '4'
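
As an optional sanity check (an assumption on my part, not part of the original flow), it is worth confirming that every row received a label: rows with left == 1, satisfaction_level >= 0.5 and average_montly_hours <= 200 are not covered by any of the conditions above and would remain NaN.

# Count rows per group, including any rows left unassigned (NaN)
df_lg2['group'].value_counts(dropna = False)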

Now, the idea is to create a score such that the higher its value, the more likely an employee is to leave the company. To do so we'll use what we know by now, for example:

  • There are features like 'satisfaction_level', 'average_montly_hours' and 'last_evaluation' where the groups with the highest and lowest values had a greater turnover rate.
  • Employees with a high salary are less likely to leave the company than employees with a medium or low salary.
  • Employees that had a work accident are less likely to leave than those who did not.
  • Employees who had a promotion in the last 5 years are less likely to leave the company than those who didn't.
  • Employees contributing to 2, 6 or 7 projects are more likely to leave the company.
  • Employees with a tenure of 5 years are the most likely to leave the company, followed by tenures of 4, 3 and 6 years; other values show smaller turnover rates.

For 'satisfaction_level', 'average_montly_hours' and 'last_evaluation', let's standardize the values using our previously designed function. 'satisfaction_level' will simply be used with the opposite sign, since higher values represent a lower likelihood of leaving the company, while a negative standardized value can be associated with a greater chance of leaving. For 'average_montly_hours' and 'last_evaluation' we'll use the absolute standardized value, since the greater the distance from the mean (0), the greater the chance of leaving the company. For the features 'salary', 'Work_accident', 'promotion_last_5years', 'number_project' and 'time_spend_company', we will directly map the existing values to a score; for example, high salary = 0 for being associated with a low turnover rate, then medium = 1 and low = 2.

The next cell defines these scores and displays the first 5 rows of the dataset for inspection.

In [33]:
df_lg2 = convert_to_zscore(df_lg2, target_col = ['satisfaction_level', 'average_montly_hours', 'last_evaluation'])
df_lg2['salary_score']= df_lg2['salary'].map({'low':2,'medium':1,'high':0})
df_lg2['work_accident_score']= df_lg2['Work_accident'].map({0:1,1:0})
df_lg2['promotion_last_5years_score']= df_lg2['promotion_last_5years'].map({0:1,1:0})
df_lg2['number_project_score']= df_lg2['number_project'].map({2:5,3:0,4:0,5:3,6:5,7:7})
df_lg2['time_spend_company_score']= df_lg2['time_spend_company'].map({2:0,3:1,4:2,5:4,6:2,7:1,8:0,9:0,10:0})
df_lg2.head()
Out[33]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years Department salary group satisfaction_level_standardized average_montly_hours_standardized last_evaluation_standardized salary_score work_accident_score promotion_last_5years_score number_project_score time_spend_company_score
0 0.38 0.53 2 157 3 0 1 0 sales low 1 -1.035625 -0.892171 -1.108944 2 1 1 5 1
1 0.80 0.86 5 262 6 0 1 0 sales medium 3 0.706608 1.262656 0.851344 1 1 1 3 2
2 0.11 0.88 7 272 4 0 1 0 sales medium 2 -2.155631 1.467878 0.970150 1 1 1 7 2
3 0.72 0.87 5 223 5 0 1 0 sales low 3 0.374754 0.462292 0.910747 2 1 1 3 4
4 0.37 0.52 2 159 3 0 1 0 sales low 1 -1.077107 -0.851126 -1.168347 2 1 1 5 1

The final 'leave_score' is simply the sum of all of our independent scores.

In [34]:
df_lg2['leave_score'] = -(df_lg2['satisfaction_level_standardized']) +\
                            abs(df_lg2['last_evaluation_standardized']) +\
                            abs(df_lg2['average_montly_hours_standardized'])+\
                            df_lg2['salary_score'] +\
                            df_lg2['work_accident_score'] +\
                            df_lg2['number_project_score'] +\
                            df_lg2['promotion_last_5years_score'] +\
                            df_lg2['time_spend_company_score']

Having defined the new feature, let's inspect how it differs between the groups.

In [35]:
leave_score_summary = df_lg2.groupby('left').agg({'leave_score':['mean','min','max','std']})
leave_score_summary
Out[35]:
leave_score
mean min max std
left
0 6.854953 0.44839 18.368268 2.887445
1 12.736268 3.41193 20.236775 2.664123
In [36]:
leave_score_summary[('leave_score', 'mean')][0] - leave_score_summary[('leave_score', 'mean')][1]
Out[36]:
-5.881314856141378

The mean values are significantly different, as we would expect. Let's see how the 3 groups of employees that left the company are represented in our new feature. We'll use a low alpha value to see where the scores tend to accumulate.

In [37]:
sns.scatterplot(df_lg2, x="leave_score", y=df_lg2["left"].astype(str),alpha=0.01,  hue="group")
plt.title('leave_score vs left: Round 1')
plt.xlabel('leave_score') 
Out[37]:
Text(0.5, 0, 'leave_score')

Let's use seaborn's regplot to get a visual representation of the model.

In [38]:
sns.regplot(df_lg2, x='leave_score', y='left', logistic=True)
Out[38]:
<AxesSubplot: xlabel='leave_score', ylabel='left'>

Let's rerun the regression model; this time we only need to pass two columns, 'left' and 'leave_score'.

In [39]:
log_clf, X_train, X_test, y_train, y_test, y_pred = perform_logistic_regression(df_lg2[['leave_score','left']])
                           precision    recall  f1-score   support

Predicted would not leave       0.92      0.95      0.93      2500
    Predicted would leave       0.69      0.57      0.62       498

                 accuracy                           0.89      2998
                macro avg       0.81      0.76      0.78      2998
             weighted avg       0.88      0.89      0.88      2998

It seems this model performs significantly better; however, we still missed almost half of the true positives in the test data.

It is reasonable to think that the individual scores may have different levels of relevance in the overall leave_score value; for example, the distribution may not change much if we change how the promotion score works, but we'll see a completely new distribution if we change how the average monthly hours score works. The next cell will help us investigate how giving more relevance to each score alters the difference in mean between the two groups of employees by running a series of iterations: the idea is to multiply each score by a weight from 0 to 3 (4^8 = 65,536 total combinations) and then keep the combination of relevance values that produces the greatest difference in means between groups, to use it in our next model. The difference in means needs to be measured on a standardized scale; otherwise we would only be measuring the increase in scale, which we already know will happen. (A more compact way to generate the same weight grid with itertools.product is sketched after the results below.)

In [40]:
df_lgt = df_lg2.copy()
relevance_levels = 4
summary = {'satisfaction_level':[], 'average_montly_hours':[], 'number_project':[], 'salary':[], 'work_accident':[],'promotion':[],'time_spend':[],'last_evaluation':[], 'mean_difference':[]}
for satisfaction_level in range(relevance_levels):
    for average_montly_hours in range(relevance_levels):
        for number_project in range(relevance_levels):
            for salary_score in range(relevance_levels):
                for work_accident in range(relevance_levels):
                    for promotion in range(relevance_levels):
                        for time_spend in range(relevance_levels):
                            for last_evaluation in range(relevance_levels):
                                df_lgt['leave_score'] = -(df_lgt['satisfaction_level_standardized'] * satisfaction_level) +\
                                                            abs(df_lgt['last_evaluation_standardized'] * last_evaluation) +\
                                                            abs(df_lgt['average_montly_hours_standardized'] * average_montly_hours)+\
                                                            (df_lgt['number_project_score']*number_project) +\
                                                            (df_lgt['salary_score']*salary_score) +\
                                                            (df_lgt['work_accident_score']*work_accident) +\
                                                            (df_lgt['promotion_last_5years_score'] * promotion) +\
                                                            (df_lgt['time_spend_company_score']*time_spend)
                                df_lgt = convert_to_zscore(df_lgt, target_col = ['leave_score'])
                                leave_coef_grouped = df_lgt.groupby('left').agg({'leave_score_standardized':['mean']})
                                mean_diff = leave_coef_grouped[('leave_score_standardized', 'mean')][0] - leave_coef_grouped[('leave_score_standardized', 'mean')][1]
                                summary['satisfaction_level'].append(satisfaction_level)
                                summary['average_montly_hours'].append(average_montly_hours)
                                summary['number_project'].append(number_project)
                                summary['salary'].append(salary_score)
                                summary['work_accident'].append(work_accident)
                                summary['promotion'].append(promotion)
                                summary['time_spend'].append(time_spend)
                                summary['last_evaluation'].append(last_evaluation)
                                summary['mean_difference'].append(mean_diff)
summary_df = pd.DataFrame(summary).sort_values(by = 'mean_difference', ascending=True).reset_index()
summary_df.head()
Out[40]:
index satisfaction_level average_montly_hours number_project salary work_accident promotion time_spend last_evaluation mean_difference
0 31211 1 3 2 1 3 2 2 3 -1.650547
1 31195 1 3 2 1 3 1 2 3 -1.650330
2 31227 1 3 2 1 3 3 2 3 -1.650195
3 31147 1 3 2 1 2 2 2 3 -1.649883
4 31131 1 3 2 1 2 1 2 3 -1.649635
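
As a side note, the eight nested loops above could be generated more compactly with itertools.product; a small sketch of how the same weight grid could be built (the scoring inside the loop would stay exactly as above):

from itertools import product

# Every combination of eight weights, each in [0, relevance_levels - 1]: 4**8 = 65,536 tuples
weight_grid = list(product(range(relevance_levels), repeat = 8))
print(len(weight_grid), weight_grid[0], weight_grid[-1])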

Since mean_difference is computed as the mean standardized score of the employees who stayed minus that of the employees who left, the most negative value (the first row after sorting in ascending order) is the combination that separates the two groups the most. This is the best combination found.

In [41]:
best_comb = summary_df[:1]
best_comb
Out[41]:
index satisfaction_level average_montly_hours number_project salary work_accident promotion time_spend last_evaluation mean_difference
0 31211 1 3 2 1 3 2 2 3 -1.650547

Let's recalculate 'leave_score' using the relevance combination that produces the greatest difference in means, and pull a new summary.

In [42]:
df_lg2['leave_score'] = -(df_lg2['satisfaction_level_standardized'] * best_comb['satisfaction_level'][0]) +\
                        abs(df_lg2['last_evaluation_standardized'] * best_comb['last_evaluation'][0]) +\
                        abs(df_lg2['average_montly_hours_standardized'] * best_comb['average_montly_hours'][0]) +\
                        abs(df_lg2['number_project_score']*best_comb['number_project'][0]) +\
                        (df_lg2['salary_score']*best_comb['salary'][0]) +\
                        (df_lg2['work_accident_score']*best_comb['work_accident'][0]) +\
                        (df_lg2['promotion_last_5years_score'] * best_comb['promotion'][0]) +\
                        (df_lg2['time_spend_company_score'] * best_comb['time_spend'][0])
In [43]:
leave_score_summary = df_lg2.groupby('left').agg({'leave_score':['mean','min','max','std']})
leave_score_summary
Out[43]:
leave_score
mean min max std
left
0 14.953789 2.027452 38.286883 5.599299
1 26.385953 6.763907 40.316098 4.739714

The following graph shows our 4 groups, their new distribution and their means.

In [44]:
sns.scatterplot(df_lg2, x="leave_score", y=df_lg2["left"].astype(str), alpha=0.03,  hue="group")
plt.title('leave_score vs left: Round 2')
plt.xlabel('leave_score') 
plt.axvline(leave_score_summary[('leave_score', 'mean')][0], color = 'red', linestyle ="--", label = 'did not leave (mean)')
plt.axvline(leave_score_summary[('leave_score', 'mean')][1], color = 'orange', linestyle ="--", label = 'left (mean)')
plt.legend(loc='center left')
Out[44]:
<matplotlib.legend.Legend at 0x1b25c574c10>
In [45]:
log_clf, X_train, X_test, y_train, y_test, y_pred = perform_logistic_regression(df_lg2[['leave_score','left']])
                           precision    recall  f1-score   support

Predicted would not leave       0.93      0.95      0.94      2500
    Predicted would leave       0.71      0.65      0.68       498

                 accuracy                           0.90      2998
                macro avg       0.82      0.80      0.81      2998
             weighted avg       0.89      0.90      0.90      2998

This model shows an improvement in detected true positives; the recall and precision scores are higher than in the two previous models. The number of false positives increases, but not significantly.

Finally, let's test how the model performs at multiple probability threshold values and see which one performs best.

In [46]:
roc_scores_dict = {'threshold':[], 'roc_auc_score':[]}
for i in range(10,55,5):
    pt = i/100
    fpr, tpr, thresholds = roc_curve(y_test, (log_clf.predict_proba(X_test)[:, 1] >=pt).astype(int))
    score = roc_auc_score(y_test, (log_clf.predict_proba(X_test)[:, 1] >=pt))
    roc_scores_dict['threshold'].append(pt)
    roc_scores_dict['roc_auc_score'].append(str(score))
    plt.plot(fpr, tpr, label = f'Threshold = {pt}')
ROC_summary = pd.DataFrame(roc_scores_dict)
plt.xlabel('False Positive Rate') 
plt.ylabel('True Positive Rate') 
plt.title('ROC Curve') 
plt.legend() 
plt.show()
In [47]:
ROC_summary.sort_values(by = 'roc_auc_score', ascending = False, inplace = True)
ROC_summary.reset_index()
best_threshold = ROC_summary[:1]['threshold']
In [48]:
ROC_summary
Out[48]:
threshold roc_auc_score
2 0.20 0.8641228915662651
3 0.25 0.8624706827309236
4 0.30 0.8582265060240963
1 0.15 0.8581630522088353
5 0.35 0.8553823293172691
0 0.10 0.8548393574297188
6 0.40 0.8461261044176709
7 0.45 0.8286457831325301
8 0.50 0.7978971887550201

It seems that a threshold of 0.20 has the highest AUC value; let's rebuild our confusion matrix using this threshold.

In [49]:
log_cm = confusion_matrix(y_test, (log_clf.predict_proba(X_test)[:, 1] >= float(best_threshold)).astype(int), labels=log_clf.classes_)
log_disp = ConfusionMatrixDisplay(confusion_matrix=log_cm, display_labels=log_clf.classes_)
log_disp.plot(values_format='')
plt.show()
In [50]:
target_names = ['Predicted would not leave', 'Predicted would leave']
print(classification_report(y_test, (log_clf.predict_proba(X_test)[:, 1] >= float(best_threshold)).astype(int), target_names=target_names))
                           precision    recall  f1-score   support

Predicted would not leave       0.97      0.87      0.92      2500
    Predicted would leave       0.56      0.86      0.68       498

                 accuracy                           0.87      2998
                macro avg       0.77      0.86      0.80      2998
             weighted avg       0.90      0.87      0.88      2998

It seems that, at the expense of increasing the number of false positives, we were able to catch 86% of our target employees. This is the point where, depending on the company's priorities, the best model could be either this one or the previous one: if we prioritize precision, the previous model is better; if we prioritize recall, this one is the one to go with.
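
If we instead wanted to pick the cutoff by balancing precision and recall directly, a small sketch using precision_recall_curve (not used elsewhere in this notebook, so this is only an illustrative alternative):

from sklearn.metrics import precision_recall_curve

probs = log_clf.predict_proba(X_test)[:, 1]
precisions, recalls, pr_thresholds = precision_recall_curve(y_test, probs)
# F1 at every candidate threshold (the last precision/recall pair has no threshold, hence [:-1])
f1_scores = 2 * precisions[:-1] * recalls[:-1] / (precisions[:-1] + recalls[:-1] + 1e-9)
print(f"Threshold maximizing F1: {pr_thresholds[f1_scores.argmax()]:.2f}")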

Conclusions.¶

If this were a real company, some recommendations we could provide based on our analysis are:

  • Investigate why average monthly work hours differ so much between employees; this can easily cause a lot of dissatisfaction.
  • Investigate and come up with a better approach to distributing the workload; it is likely that the employees with 6 or 7 projects left due to exhaustion.
  • Assuming that the employees with only 2 projects were not given more due to a lack of capacity, it is likely the company needs to implement a better training program.
  • The fact that many employees left the company after 5 years seems to indicate a lack of recognition or growth opportunities.

As for the model:

  • Unfortunately our model does not perform better than a decision-tree-based classifier, which is the known optimal solution for this problem; however, we were able to improve the logistic regression's performance by engineering our own score, making it a viable alternative if computational demand represents a problem.
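
For completeness, since the decision tree is mentioned as the known optimal approach, here is a minimal baseline sketch one could run for comparison on the same preprocessed data (the hyperparameters below are illustrative and untuned, and this cell was not executed in this notebook):

from sklearn.tree import DecisionTreeClassifier

y = df_lg1['left']
X = df_lg1.drop('left', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)
tree_clf = DecisionTreeClassifier(max_depth=6, random_state=42).fit(X_train, y_train)
print(classification_report(y_test, tree_clf.predict(X_test)))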